#!/usr/bin/Rscript

library(tm)
library(plyr)

#FIRST RUN "02_13_m5s_forum_import_tables.R"

# Check for database consistency
# orphan_author_alpha <- sapply(author_alpha$url, match, thread$authorUrl)
# orphan_author_alpha <- sum(is.na(orphan_author_alpha))
# orphan_author_beta <- sapply(author_beta$authorUrl, match, comment$authorUrl)
# orphan_author_beta  <- sum(is.na(orphan_author_beta))
# orphan_thread <- sapply(thread$authorUrl, match, author_alpha$url)
# orphan_thread <-  sum(is.na(orphan_thread))
# orphan_comment <- sapply(comment$authorUrl, match, author_beta$authorUrl)
# orphan_comment <-  sum(is.na(orphan_comment))

# Keep untouched copies of the tables before any of the cleaning steps below
# modifies them.
raw_author_alpha <- author_alpha
raw_author_beta <- author_beta
raw_comment <- comment
raw_thread <- thread

# Correct an authorUrl that does not correspond to the real name of the author
# (1 case so far). The URLs are matched literally (fixed = TRUE): as a regular
# expression every "." in them would match any character.
wrong_author_url <- "http://www.beppegrillo.it/listeciviche/author/ornella-combi"
right_author_url <- "http://www.beppegrillo.it/listeciviche/author/marino-mastrangeli"
author_alpha$url <- gsub(
  wrong_author_url, right_author_url, author_alpha$url,
  fixed = TRUE
)
thread$authorUrl <- gsub(
  wrong_author_url, right_author_url, thread$authorUrl,
  fixed = TRUE
)
rm(wrong_author_url, right_author_url)

# Load the helper functions from fun.R. script_path is concatenated as-is, so
# it must already end with a path separator.
source(paste0(script_path, "fun.R"))

# Remove duplicated threads in two passes: the first pass flags the later
# copies (scan top -> bottom), the second pass flags the earlier copies
# (scan bottom -> top) among the threads that are left.
# Two threads are duplicates when they share the author and the link once the
# optional "-<digits>" suffix and the ".html" extension are stripped.
# A flagged thread is deleted only when whatDupDelete() returns TRUE for it
# (presumably when it has no comment -- confirm in fun.R).
dup_threads_removed <- 0

for (scan_from_last in c(FALSE, TRUE)) {
  link_stem <- gsub("(-[0-9]+)?\\.html", "", thread$link)
  dup_key <- data.frame(link_stem, thread$authorUrl)
  candidate_ids <- thread$threadId[duplicated(dup_key, fromLast = scan_from_last)]
  delete_flag <- sapply(candidate_ids, whatDupDelete, comment$threadId)
  doomed_ids <- candidate_ids[which(delete_flag == TRUE)]
  dup_threads_removed <- dup_threads_removed + length(doomed_ids)
  thread <- thread[!(thread$threadId %in% doomed_ids), ]
}

rm(scan_from_last, link_stem, dup_key, candidate_ids, delete_flag, doomed_ids)

# Keep only the threads and comments created on or before cut_date.
# Explicit indexing is used instead of subset(): subset() evaluates its
# condition inside the table first, so a column named cut_date would silently
# shadow the variable. which() discards the rows whose comparison is NA and
# drop = FALSE keeps the result a data frame, both exactly as subset() did.
thread <- thread[which(thread$createdAt <= cut_date), , drop = FALSE]
comment <- comment[which(comment$createdAt <= cut_date), , drop = FALSE]

# Remove authors with no proposal (author_alpha) or no comment (author_beta)
# left in the tables. %in% runs the membership test in a single vectorised
# call instead of one match() call per author, and selects the same rows.
author_alpha <- author_alpha[author_alpha$url %in% thread$authorUrl, ]
author_beta <- author_beta[author_beta$authorUrl %in% comment$authorUrl, ]

# Define source of data: file names of the English and Italian word lists
# that are read into the dictionaries further down in this script
dictionary_file_english <- "eng_most_freq_words_machine_ready.txt"
dictionary_file_italian <- "ita_most_freq_words_machine_ready.txt"

# Clean tables from coding errors: each text column below (author names,
# comment messages, thread titles and messages) is passed through
# replaceCodingError() and overwritten with the result.
# NOTE(review): replaceCodingError() is not defined in this section
# (presumably it comes from fun.R) -- confirm which errors it repairs.
author_alpha$name <- replaceCodingError(author_alpha$name)
author_beta$name <- replaceCodingError(author_beta$name)
comment$rawMessage <- replaceCodingError(comment$rawMessage)
comment$message <- replaceCodingError(comment$message)
thread$title <- replaceCodingError(thread$title)
thread$message <- replaceCodingError(thread$message)

# Shorten the long profile URLs used as primary/foreign key: the authorUrl of
# every author_beta row and of every comment goes through replaceLongUrl(),
# which receives the literal URL prefix, its escaped form (from re.escape())
# and the short replacement prefix.
# Example of a long URL:
# "http://www.beppegrillo.it/cgi-bin/mt-4/mt-cp.cgi?__mode=view&amp;amp;blog_id=9&amp;amp;id=449"
pattern <- "http://www.beppegrillo.it/cgi-bin/mt-4/mt-cp.cgi?__mode=view&amp;amp;blog_id=9&amp;amp;id="
pattern_escaped <- re.escape(pattern)
replacement <- "id:"

author_beta$authorUrl <- sapply(
  author_beta$authorUrl,
  function(url) replaceLongUrl(url, pattern, pattern_escaped, replacement)
)
comment$authorUrl <- sapply(
  comment$authorUrl,
  function(url) replaceLongUrl(url, pattern, pattern_escaped, replacement)
)

rm(pattern, pattern_escaped, replacement)


# TODO: replace the author_beta id when it is a thread URL (no code for this step yet)


# Load each word list as a plain character vector.
# NOTE(review): the English words are taken from column V2 and the Italian
# ones from column V1 -- presumably the two files have different layouts;
# confirm against the files.
dictionary_english <- as.character(
  read.table(dictionary_file_english, quote = "\"")$V2
)
dictionary_italian <- as.character(
  read.table(dictionary_file_italian, quote = "\"")$V1
)
rm(dictionary_file_english, dictionary_file_italian)

# Dictionary() no longer supported by 'tm' package. Character vector should be ok.
# Create dictionaries (tm package objects) 
# dictionary_english <- Dictionary(dictionary_english)
# dictionary_italian <- Dictionary(dictionary_italian)

# Label every raw comment against each dictionary with getSpamLabel(), using
# 1 (English) and 2 (Italian) as threshold and 2 as the minimum length of a
# word to be used by the function.
# WARNING: COMPUTATIONALLY INTENSIVE
label_english <- vapply(
  comment$rawMessage,
  function(msg) getSpamLabel(msg, dictionary_english, 1, 2),
  FUN.VALUE = numeric(1),
  USE.NAMES = FALSE
)
label_italian <- vapply(
  comment$rawMessage,
  function(msg) getSpamLabel(msg, dictionary_italian, 2, 2),
  FUN.VALUE = numeric(1),
  USE.NAMES = FALSE
)
rm(dictionary_english, dictionary_italian)

# Binary vector for spam: 1 (spam) when the English label is 1 and the
# Italian label is 0, otherwise 0 (ham)
english_only <- label_english == 1 & label_italian == 0
spam <- ifelse(english_only, 1, 0)
rm(label_english, label_italian, english_only)

# Check flagged comments
# checkFlagged <- subset(comment, spam==1)
# rm(checkFlagged)

# Collect the authorUrl of every comment flagged as spam, then run the URLs
# through filterAuthorOut() once per pattern. Entries that come back as the
# "THIS_IS_HAM" marker are dropped as genuine authors, and the remaining URLs
# are de-duplicated.
spamAuthor <- comment$authorUrl[which(spam == 1)]
for (genuine_pattern in c("disqus.com", "www.beppegrillo.it")) {
  spamAuthor <- sapply(spamAuthor, filterAuthorOut, genuine_pattern)
}
# count(spamAuthor == "THIS_IS_HAM")
spamAuthor <- unique(spamAuthor[which(spamAuthor != "THIS_IS_HAM")])
rm(genuine_pattern)

# Remove the spammers from author_beta and their comments from comment,
# storing what is removed in spam_author and spam_comment.
#
# One vectorised membership test replaces the former per-spammer loop, which
# rescanned both tables and grew the spam tables with rbind() at every
# iteration. Unlike `==`, %in% never yields NA, so rows whose authorUrl is
# missing stay intact instead of being turned into all-NA rows.
# With no spammer at all, spam_author and spam_comment are now empty tables
# that keep the columns of their source table.
spam_author_rows <- author_beta$authorUrl %in% spamAuthor
spam_comment_rows <- comment$authorUrl %in% spamAuthor

spam_author <- author_beta[spam_author_rows, ]
author_beta <- author_beta[!spam_author_rows, ]
spam_comment <- comment[spam_comment_rows, ]
comment <- comment[!spam_comment_rows, ]

# Group the stored rows by spammer, following the order of spamAuthor, as the
# loop did; order() leaves ties in their original table order.
spam_author <- spam_author[
  order(match(spam_author$authorUrl, spamAuthor)),
]
spam_comment <- spam_comment[
  order(match(spam_comment$authorUrl, spamAuthor)),
]

rm(spamAuthor, spam, spam_author_rows, spam_comment_rows)
